***********************************************
* Title: rwanda_jde_tableA12.do
* Author: Todd Pugatch
* Last update: June 10 2024
* Description: analysis for Blimpo and Pugatch, "Entrepreneurship Education
*	and Teacher Training in Rwanda," Stage 2 Registered report, Journal of 
*	Development Economics
* Inputs: 	student_merge_jde.dta
* Outputs: 	rwanda_jde_tableA12.[txt/out]
* Notes: produces Table A12
************************************************

local start=`"$S_TIME"'
clear
clear matrix
clear mata
graph drop _all
program drop _all
cap log close
set more off

* Set directories 
*global main "[SET MAIN DIRECTORY HERE]"
	global rawdata "$main/01_data/01_raw"
	global cleandata "$main/01_data/02_clean"
	global dofiles "$main/02_dofiles"
	global results "$main/03_results"
	global temp "$main/04_output"

* begin log file
log using "$temp/rwanda_jde_tableA12.txt", text replace

* data prep: merge take-up with student surveys, by school
local takeup "training_any_pct exchange_pct"
qui use "$cleandata/student_merge_jde.dta", clear
qui merge m:1 school_code using "$cleandata/rwanda_school_trainattend_jde.dta", keepusing(`takeup')
qui drop if _merge==2
drop _merge
qui save "$temp/studenttemp.dta", replace	

****************************************************
* SET UP MULTIPLE HYPOTHESIS CORRECTION
/*Proceed in 3 steps:
	1. Run all regressions for H1. Save coefficients & std. errors.
	2. Calculate q-values for all hypotheses.
	3. Re-run regressions to get output, including q-values.*/
****************************************************

* 1. GET COEF/SE/P FOR ALL REGRESSIONS	
* ANALYSIS
/*Goal: produce columns 1-10 of Table P6
	Uses endline student survey.
	Regress outcome on treatment status, controlling for randomization strata. 
	Cluster s.e.'s by school.*/
qui use "$temp/studenttemp.dta", clear

/*get mean/sd of selected control group outcomes, per request of Della Vigna et al
	team at Berkeley gathering predictions on project outcomes*/
su dropout personal_business_el if treatment==0

/*DROPOUT*/
/*Column 1: dropout*/
* dropout appears in student endline survey (not head teacher survey, as originally written in JDE Reg. Report Stage 1)
* note that we don't know if those who attrited from survey have dropped out or not
* also, there is no relevant baseline outcome to include here
local i=1
qui ivregress 2sls dropout (exchange_pct=treatment) i.strata, cluster(school_code)
scalar define H4_c`i'b=_b[exchange_pct]
scalar define H4_c`i'se=_se[exchange_pct]
scalar define t=_b[exchange_pct]/_se[exchange_pct]
scalar define dfr=e(N)-e(df_m)
scalar define H4_c`i'p=2*ttail(dfr,abs(t))
local i=`i'+1

/*ECONOMIC ACTIVITY*/	
* winsorize all financial variables used in analysis
local fin_el "earn_last2mths_usd profits_last2mths_adj_usd earn_alt_adj_usd"
local fin_bl "earn_last2mths_usd business_inc_last2mths_usd"
foreach w in el bl {
	foreach x in `fin_`w'' {
		winsor `x'_`w', gen(`x'w_`w') p(.01) highonly
		lab var `x'w_`w' "`x'_`w', winsorized at 99th percentile"
	}
}

* prep missing values for students without baseline outcome
/*first, impute zero for (winsorized) business income for those who don't report owning a business 
	(original version conditioned on business ownership)*/
qui replace business_inc_last2mths_usdw_bl=0 if ownbusiness_bl==0

/*now impute control group mean to remaining missing values*/
local Y0 "ownbusiness_bl buspartners_school_bl buspartners_family_bl ownbusiness_nonag_bl earn_money_bl earn_last2mths_usdw_bl business_inc_last2mths_usdw_bl"
foreach y0 in `Y0' {
	qui gen `y0'i=`y0'
	qui replace `y0'i=r(mean) if `y0'==.	
	qui su `y0' if treatment==0
	qui gen `y0'm=(`y0'==.)
	lab var `y0'i "`y0', imputing control mean for missing values"
	lab var `y0'm "missing value for `y0'"
}

/*Columns 2-8: business participation, characteristics, employment*/
/*set up outcomes and control variables for loop*/
local Y "personal_business_el business_solo_el business_sbc_el business_famorpeers_el businesstype_nonag_el	business_employs_el	employed_el"
local y02 "ownbusiness_bl"
local y03 "ownbusiness_bl"
local y04 "buspartners_school_bl"
local y05 "buspartners_family_bl"
local y06 "ownbusiness_nonag_bl"
local y07 "ownbusiness_bl"
local y08 "earn_money_bl"

/*regressions: columns 2-8*/
local stat ""control mean",mu_c,"baseline mean",mu_0,"lower bound",lb,"upper bound",ub"
local i=2 /*initialize loop*/
foreach y in `Y' {
	qui ivregress 2sls `y' (exchange_pct=treatment) `y0`i''i `y0`i''m i.strata, cluster(school_code)
	scalar define H4_c`i'b=_b[exchange_pct]
	scalar define H4_c`i'se=_se[exchange_pct]
	scalar define t=_b[exchange_pct]/_se[exchange_pct]
	scalar define dfr=e(N)-e(df_m)
	scalar define H4_c`i'p=2*ttail(dfr,abs(t))
	local i=`i'+1
}

/*Columns 9-10: income and business profits*/
/*also use alternative measure for column 9, as robustness check*/
local Y "earn_last2mths_usdw_el	profits_last2mths_adj_usdw_el earn_alt_adj_usdw_el"
local y09 "earn_last2mths_usdw_bl"
local y010 "business_inc_last2mths_usdw_bl"
local y011 "earn_last2mths_usdw_bl"
local i=9 /*initialize loop*/
foreach y in `Y' {
	qui ivregress 2sls `y' (exchange_pct=treatment) `y0`i''i `y0`i''m i.strata, cluster(school_code)
	scalar define H4_c`i'b=_b[exchange_pct]
	scalar define H4_c`i'se=_se[exchange_pct]
	scalar define t=_b[exchange_pct]/_se[exchange_pct]
	scalar define dfr=e(N)-e(df_m)
	scalar define H4_c`i'p=2*ttail(dfr,abs(t))
	local i=`i'+1
}

* 2. CALCULATE Q-VALUES FOR ALL HYPOTHESES
/*Calculate sharpened q-values from Benjamini, Krieger, Yuketeli (2006).
	Use code from fdr_sharpened_qvalues.do from Anderson (2008), available at 
		https://are.berkeley.edu/~mlanderson/ARE_Website/Research.html*/
* prep data: put p-values from above into a dataset
clear
local n=`i'-1
qui set obs `n'
qui gen hypothesis=4
qui gen column=_n
foreach x in b se pval {
	qui gen double `x'=.
}
forval i=1/`n' {
	qui replace b=H4_c`i'b in `i'
	qui replace se=H4_c`i'se in `i'
	qui replace pval=H4_c`i'p in `i' 
}

* START OF ANDERSON CODE FROM fdr_sharpened_qvalues.do (modified)
* Collect the total number of p-values tested
quietly sum pval
local totalpvals = r(N)

* Sort the p-values in ascending order and generate a variable that codes each p-value's rank

quietly gen int original_sorting_order = _n
quietly sort pval
quietly gen int rank = _n if pval~=.

* Generate the variable that will contain the BKY (2006) sharpened q-values

qui gen bky06_qval = 1 if pval~=.
	
* Set the initial counter to 1 

local qval = 1

/* Set up a loop that begins by checking which hypotheses are rejected at q = 1.000, 
	then checks which hypotheses are rejected at q = 0.999, 
	then checks which hypotheses are rejected at q = 0.998, etc.  
	The loop ends by checking which hypotheses are rejected at q = 0.001. */
while `qval' > 0 {
	* First Stage
	* Generate the adjusted first stage q level we are testing: q' = q/1+q
	local qval_adj = `qval'/(1+`qval')
	
	* Generate value q'*r/M
	qui gen fdr_temp1 = `qval_adj'*rank/`totalpvals'
	
	* Generate binary variable checking condition p(r) <= q'*r/M
	qui gen reject_temp1 = (fdr_temp1>=pval) if pval~=.
	
	* Generate variable containing p-value ranks for all p-values that meet above condition
	qui gen reject_rank1 = reject_temp1*rank
	
	* Record the rank of the largest p-value that meets above condition
	qui egen total_rejected1 = max(reject_rank1)

	* Second Stage
	* Generate the second stage q level that accounts for hypotheses rejected in first stage: q_2st = q'*(M/m0)
	local qval_2st = `qval_adj'*(`totalpvals'/(`totalpvals'-total_rejected1[1]))
	
	* Generate value q_2st*r/M
	qui gen fdr_temp2 = `qval_2st'*rank/`totalpvals'
	
	* Generate binary variable checking condition p(r) <= q_2st*r/M
	qui gen reject_temp2 = (fdr_temp2>=pval) if pval~=.
	
	* Generate variable containing p-value ranks for all p-values that meet above condition
	qui gen reject_rank2 = reject_temp2*rank
	
	* Record the rank of the largest p-value that meets above condition
	qui egen total_rejected2 = max(reject_rank2)

	* A p-value has been rejected at level q if its rank is less than or equal to the rank of the max p-value that meets the above condition
	qui replace bky06_qval = `qval' if rank <= total_rejected2 & rank~=.
	
	* Reduce q by 0.001 and repeat loop
	drop fdr_temp* reject_temp* reject_rank* total_rejected*
	local qval = `qval' - .001
}

quietly sort original_sorting_order

******************************************************
* END OF ANDERSON CODE FROM fdr_sharpened_qvalues.do (modified)
******************************************************
* list results
list hypothesis column b se pval bky06_qval

* prep results for inclusion in final regression output
forval i=1/`n' {
	scalar define H4_c`i'q=bky06_qval in `i'
}

* 3. RE-RUN REGRESSIONS TO GET OUTPUT, INCLUDING Q-VALUES
* ANALYSIS
/*Goal: produce columns 1-10 of Table P6
	Uses endline student survey.
	Regress outcome on treatment status, controlling for randomization strata. 
	Cluster s.e.'s by school.
	Include sharpened q-values from above.*/
qui use "$temp/studenttemp.dta", clear

/*DROPOUT*/
/*Column 1: dropout*/
* dropout appears in student endline survey (not head teacher survey, as originally written in JDE Reg. Report Stage 1)
* note that we don't know if those who attrited from survey have dropped out or not
* also, there is no relevant baseline outcome to include here
local stat ""1st stage F",F,"control mean",mu_c,"q-value",qv"

local i=1
qui ivregress 2sls dropout (exchange_pct=treatment) i.strata, cluster(school_code)
qui estat first
matrix define X=r(singleresults)
scalar define F=X[1,4]	
qui su dropout if treatment==0 & e(sample)
scalar define mu_c=r(mean)
scalar define qv=H4_c`i'q
outreg2 exchange_pct using "$results/rwanda_jde_tableA12.xls", se excel nolabel nocons addstat(`stat') replace

/*ECONOMIC ACTIVITY*/	
* winsorize all financial variables used in analysis
local fin_el "earn_last2mths_usd profits_last2mths_adj_usd earn_alt_adj_usd"
local fin_bl "earn_last2mths_usd business_inc_last2mths_usd"
foreach w in el bl {
	foreach x in `fin_`w'' {
		winsor `x'_`w', gen(`x'w_`w') p(.01) highonly
		lab var `x'w_`w' "`x'_`w', winsorized at 99th percentile"
	}
}

* prep missing values for students without baseline outcome
/*first, impute zero for (winsorized) business income for those who don't report owning a business 
	(original version conditioned on business ownership)*/
qui replace business_inc_last2mths_usdw_bl=0 if ownbusiness_bl==0

/*now impute control group mean to remaining missing values*/
local Y0 "ownbusiness_bl buspartners_school_bl buspartners_family_bl ownbusiness_nonag_bl earn_money_bl earn_last2mths_usdw_bl business_inc_last2mths_usdw_bl"
foreach y0 in `Y0' {
	qui gen `y0'i=`y0'
	qui replace `y0'i=r(mean) if `y0'==.	
	qui su `y0' if treatment==0
	qui gen `y0'm=(`y0'==.)
	lab var `y0'i "`y0', imputing control mean for missing values"
	lab var `y0'm "missing value for `y0'"
}

/*Columns 2-8: business participation, characteristics, employment*/
/*set up outcomes and control variables for loop*/
local Y "personal_business_el business_solo_el business_sbc_el business_famorpeers_el businesstype_nonag_el	business_employs_el	employed_el"
local y02 "ownbusiness_bl"
local y03 "ownbusiness_bl"
local y04 "buspartners_school_bl"
local y05 "buspartners_family_bl"
local y06 "ownbusiness_nonag_bl"
local y07 "ownbusiness_bl"
local y08 "earn_money_bl"

/*regressions: columns 2-8*/
local stat ""1st stage F",F,"control mean",mu_c,"baseline mean",mu_0,"q-value",qv"
local i=2 /*initialize loop*/
foreach y in `Y' {
	qui ivregress 2sls `y' (exchange_pct=treatment) `y0`i''i `y0`i''m i.strata,  cluster(school_code)
	qui estat first
	matrix define X=r(singleresults)
	scalar define F=X[1,4]	
	qui su `y' if treatment==0 & e(sample)
	scalar define mu_c=r(mean)
	qui su `y0`i'' if e(sample)
	scalar define mu_0=r(mean)
	scalar define qv=H4_c`i'q
	outreg2 exchange_pct using "$results/rwanda_jde_tableA12.xls", se excel nolabel nocons addstat(`stat') append
	local i=`i'+1
}

/*Columns 9-10: income and business profits*/
/*also use alternative measure for column 9, as robustness check*/
local Y "earn_last2mths_usdw_el	profits_last2mths_adj_usdw_el earn_alt_adj_usdw_el"
local y09 "earn_last2mths_usdw_bl"
local y010 "business_inc_last2mths_usdw_bl"
local y011 "earn_last2mths_usdw_bl"
local i=9 /*initialize loop*/
foreach y in `Y' {	
	qui ivregress 2sls `y' (exchange_pct=treatment) `y0`i''i `y0`i''m i.strata,  cluster(school_code)
	qui estat first
	matrix define X=r(singleresults)
	scalar define F=X[1,4]	
	qui su `y' if treatment==0 & e(sample)
	scalar define mu_c=r(mean)
	qui su `y0`i'' if e(sample)
	scalar define mu_0=r(mean)
	scalar define qv=H4_c`i'q
	outreg2 exchange_pct using "$results/rwanda_jde_tableA12.xls", se excel nolabel nocons addstat(`stat') append
	local i=`i'+1
}


erase "$temp/studenttemp.dta"
local end=`"$S_TIME"' 
di "`start'"
di "`end'"
log close
